{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Retrieving, Processing, and Storing Data"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Comparing Numpy .npy binary and pandas pickling"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size CSV file: 36684\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tempfile import NamedTemporaryFile\n",
    "from os.path import getsize\n",
    "\n",
    "np.random.seed(40)\n",
    "a = np.random.randn(365, 4)\n",
    "\n",
    "tmpf = NamedTemporaryFile()\n",
    "np.savetxt(tmpf, a, delimiter=',')\n",
    "print(\"Size CSV file:\", getsize(tmpf.name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Shape: (365, 4)\n",
      "Size .npy file 11808\n"
     ]
    }
   ],
   "source": [
    "tmpf = NamedTemporaryFile()\n",
    "np.save(tmpf, a)\n",
    "tmpf.seek(0)\n",
    "loaded = np.load(tmpf)\n",
    "print(\"Shape:\", loaded.shape)\n",
    "print(\"Size .npy file\", getsize(tmpf.name))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Size pickled dataframe: 12254\n",
      "DF from pickle\n",
      "             0         1         2         3\n",
      "0   -0.607548 -0.126136 -0.684606  0.928715\n",
      "1   -1.844401 -0.467002  2.292490  0.488810\n",
      "2    0.710267  1.055534  0.054073  0.257953\n",
      "3    0.588282  0.885244 -1.017007 -0.133693\n",
      "4   -0.438186  0.493443 -0.199009 -1.274984\n",
      "5    0.293494  0.108950  0.031727  1.272640\n",
      "6    1.071448  0.415818  1.550679 -0.311379\n",
      "7   -1.379240  1.371409  0.027712 -0.320400\n",
      "8   -0.846170 -0.433429 -1.337035  0.209172\n",
      "9   -1.424321 -0.553477  0.074799 -0.505620\n",
      "10   1.052408  0.971400  0.076832 -0.435001\n",
      "11   0.552994  0.266716  0.008989  0.641103\n",
      "12  -0.177707  0.696278 -1.188725 -0.331697\n",
      "13   0.030076 -1.107915 -0.549925 -2.032910\n",
      "14   1.407918  0.633108  2.212747 -0.526602\n",
      "15   0.542882 -0.084480  1.292015 -0.176711\n",
      "16   1.687787 -1.046614  0.642120 -0.172962\n",
      "17  -1.112065 -0.020703 -1.813522  0.203522\n",
      "18   0.531874 -1.017023 -1.848051  0.172549\n",
      "19   0.786202 -0.071470  0.836349 -0.198229\n",
      "20  -0.092273  0.870726 -0.839192  0.823255\n",
      "21   0.701752  0.598361 -0.507995  0.136138\n",
      "22  -1.492115  0.611666  1.369528 -1.702731\n",
      "23  -0.518781 -1.234703 -2.880995 -0.015723\n",
      "24  -1.692685 -0.619635  0.874325  1.719519\n",
      "25   0.034290 -0.848434  0.268831 -1.741377\n",
      "26   0.126493 -0.154794  0.806941 -0.891311\n",
      "27  -0.064557 -0.136263 -0.173894  1.321633\n",
      "28  -1.568562 -0.578639  0.436006 -0.752909\n",
      "29  -0.924102 -2.437053 -1.588091 -0.928011\n",
      "..        ...       ...       ...       ...\n",
      "335 -0.966062  1.557927  1.247241 -0.245374\n",
      "336  1.954565 -0.805680 -1.195319 -0.060237\n",
      "337  0.485024 -0.096182 -0.907348  0.301056\n",
      "338 -0.306539 -1.418745 -0.824736 -0.057442\n",
      "339  0.531255  1.872637  0.751428 -0.740259\n",
      "340  0.324991  0.817867  1.076038  1.687028\n",
      "341  0.518871 -0.147541  1.584002  0.522067\n",
      "342  0.220567  0.365239  0.592816 -0.539344\n",
      "343 -0.149947  1.133718  0.695652 -0.478403\n",
      "344  0.639750 -0.320823  0.973948 -2.303477\n",
      "345  0.153491 -0.418740 -0.678516  0.995254\n",
      "346 -0.930009  1.207457  1.028136  0.032945\n",
      "347  0.679699  1.492040 -0.404999  0.928465\n",
      "348  0.319291 -0.500540 -1.185521  1.783808\n",
      "349  0.944269  0.328684 -0.997614  0.718727\n",
      "350  1.537388  2.345475  1.028155  0.004297\n",
      "351  2.516721  0.810877 -0.458593  1.064805\n",
      "352  0.270497  0.428377 -0.029495  1.361203\n",
      "353  0.963022  1.466582 -0.891680 -0.199557\n",
      "354  2.555880  1.908917 -1.405039 -1.616769\n",
      "355 -1.454413  0.171307 -1.319554 -1.091206\n",
      "356 -1.667690  0.226658 -0.185454 -0.566786\n",
      "357  0.099711 -0.805140  0.232977 -0.653727\n",
      "358  0.759858  0.379648 -0.706697 -0.555253\n",
      "359 -0.078923 -0.800522 -0.995564 -0.506843\n",
      "360  0.410575  0.513890  1.430660 -0.273739\n",
      "361 -0.845906  1.987754 -0.971906  1.900442\n",
      "362 -0.195155 -0.587276 -0.516338  0.164434\n",
      "363 -1.191338  1.043126 -0.125925 -0.702099\n",
      "364  0.885382  0.684500  1.409991  1.278488\n",
      "\n",
      "[365 rows x 4 columns]\n"
     ]
    }
   ],
   "source": [
    "tmpf = NamedTemporaryFile()\n",
    "df = pd.DataFrame(a)\n",
    "df.to_pickle(tmpf.name)\n",
    "print(\"Size pickled dataframe:\", getsize(tmpf.name))\n",
    "print(\"DF from pickle\\n\", pd.read_pickle(tmpf.name))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Storing data with PyTables(HDF format)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'tables.group.RootGroup'>\n",
      "13728\n",
      "<class 'numpy.ndarray'> (365, 4)\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import tables\n",
    "from tempfile import NamedTemporaryFile\n",
    "from os.path import getsize\n",
    "\n",
    "np.random.seed(42)\n",
    "a = np.random.randn(365, 4)\n",
    "\n",
    "tmpf = NamedTemporaryFile()\n",
    "h5file = tables.open_file(tmpf.name, mode='w', title='Numpy Array')\n",
    "root = h5file.root\n",
    "h5file.create_array(root, \"array\", a)\n",
    "print(type(root))\n",
    "h5file.close()\n",
    "\n",
    "h5file = tables.open_file(tmpf.name, 'r')\n",
    "print(getsize(tmpf.name))\n",
    "\n",
    "for node in h5file.iter_nodes(h5file.root):\n",
    "    b = node.read()\n",
    "    print(type(b), b.shape)\n",
    "\n",
    "h5file.close()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading and writing pandas DataFrame to HDF5 stores"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.io.pytables.HDFStore'>\n",
      "File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpn6zfvs4o\n",
      "Empty\n"
     ]
    }
   ],
   "source": [
    "tmpf = NamedTemporaryFile()\n",
    "store = pd.HDFStore(tmpf.name)\n",
    "print(store)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.io.pytables.HDFStore'>\n",
      "File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpp5lne_88\n",
      "/df            frame        (shape->[365,4])\n"
     ]
    }
   ],
   "source": [
    "store['df'] = df\n",
    "print(store)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Read value from store"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(365, 4)\n",
      "(365, 4)\n",
      "(365, 4)\n"
     ]
    }
   ],
   "source": [
    "print(store.get('df').shape)\n",
    "print(store['df'].shape)\n",
    "print(store.df.shape)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.io.pytables.HDFStore'>\n",
      "File path: /var/folders/rd/fxns6vhj1m12_gpmxw405gq40000gn/T/tmpp5lne_88\n",
      "Empty\n"
     ]
    }
   ],
   "source": [
    "del store['df']\n",
    "print(store)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "- DataFrame支持直接保存为HDF格式文件 DataFrame.to_hdf()\n",
    "- Pandas顶层函数也支持从HDF文件读取 pd.read_hdf()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "            0         1         2         3\n",
      "361 -0.845906  1.987754 -0.971906  1.900442\n",
      "362 -0.195155 -0.587276 -0.516338  0.164434\n",
      "363 -1.191338  1.043126 -0.125925 -0.702099\n",
      "364  0.885382  0.684500  1.409991  1.278488\n"
     ]
    }
   ],
   "source": [
    "tmpf = NamedTemporaryFile()\n",
    "df.to_hdf(tmpf.name, 'mydata', format='table', mode='w')\n",
    "print(pd.read_hdf(tmpf.name, 'mydata', where=['index>360']))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Reading and writing to Excel with Pandas"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Mean\n",
      " 0   -0.008083\n",
      "1    0.008668\n",
      "2    0.015442\n",
      "3   -0.061955\n",
      "dtype: float64\n"
     ]
    }
   ],
   "source": [
    "import numpy as np\n",
    "import pandas as pd\n",
    "from tempfile import NamedTemporaryFile\n",
    "\n",
    "tmpf = NamedTemporaryFile(suffix='.xlsx')\n",
    "np.random.seed(53)\n",
    "a = np.random.randn(365, 4)\n",
    "df = pd.DataFrame(a)\n",
    "df.to_excel(tmpf.name, sheet_name='Random Data')\n",
    "print(\"Mean\\n\", pd.read_excel(tmpf.name, 'Random Data').mean())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Using JSON"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Netherlands\n",
      "{\"country\": \"Brazil\", \"dma_code\": \"0\", \"timezone\": \"Europe/Amsterdam\", \"area_code\": \"0\", \"ip\": \"46.19.37.108\", \"asn\": \"AS196752\", \"continent_code\": \"EU\", \"isp\": \"Tilaa V.O.F.\", \"longitude\": 5.75, \"latitude\": 52.5, \"country_code\": \"NL\", \"country_code3\": \"NLD\"}\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "\n",
    "json_str=\"\"\"{\"country\":\"Netherlands\", \n",
    "        \"dma_code\":\"0\",\n",
    "        \"timezone\":\"Europe\\/Amsterdam\",\n",
    "        \"area_code\":\"0\",\n",
    "        \"ip\":\"46.19.37.108\",\n",
    "        \"asn\":\"AS196752\",\n",
    "        \"continent_code\":\"EU\",\n",
    "        \"isp\":\"Tilaa V.O.F.\",\n",
    "        \"longitude\":5.75,\n",
    "        \"latitude\":52.5,\n",
    "        \"country_code\":\"NL\",\n",
    "        \"country_code3\":\"NLD\"}\"\"\"\n",
    "data = json.loads(json_str)\n",
    "print(data['country'])\n",
    "data['country'] = 'Brazil'\n",
    "print(json.dumps(data))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "area_code                        0\n",
      "asn                       AS196752\n",
      "continent_code                  EU\n",
      "country                Netherlands\n",
      "country_code                    NL\n",
      "country_code3                  NLD\n",
      "dma_code                         0\n",
      "ip                    46.19.37.108\n",
      "isp                   Tilaa V.O.F.\n",
      "latitude                      52.5\n",
      "longitude                     5.75\n",
      "timezone          Europe/Amsterdam\n",
      "dtype: object\n",
      "{\"area_code\":\"0\",\"asn\":\"AS196752\",\"continent_code\":\"EU\",\"country\":\"China\",\"country_code\":\"NL\",\"country_code3\":\"NLD\",\"dma_code\":\"0\",\"ip\":\"46.19.37.108\",\"isp\":\"Tilaa V.O.F.\",\"latitude\":52.5,\"longitude\":5.75,\"timezone\":\"Europe\\/Amsterdam\"}\n"
     ]
    }
   ],
   "source": [
    "data = pd.read_json(json_str, typ='series')\n",
    "print(data)\n",
    "data['country'] = 'China'\n",
    "print(data.to_json())"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Parsing HTML with BeautifulSoup"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 32,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "First link:\n",
      " <a class=\"home-signup__button\" href=\"/?login=true\">\r\n",
      "                    Create an account\r\n",
      "                </a>\n"
     ]
    }
   ],
   "source": [
    "from bs4 import BeautifulSoup\n",
    "import requests\n",
    "import re\n",
    "\n",
    "html = requests.get('http://www.kaggle.com').content\n",
    "soup = BeautifulSoup(html, 'html.parser', from_encoding='utf-8')\n",
    "print(\"First link:\\n\", soup.a)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 33,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/?login=true\n"
     ]
    }
   ],
   "source": [
    "print(soup.a['href'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 36,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\r\n",
      "                    Create an account\r\n",
      "                 : /?login=true\n",
      "\r\n",
      "                    Host a competition\r\n",
      "                 : https://www.kaggle.com/host\n",
      "None : /jobs\n",
      "None : /competitions\n",
      "Want to host a competition? : /host\n",
      "None : /datasets\n",
      "None : /kernels\n",
      "None : /c/titanic\n",
      "None : /kernels\n",
      "None : http://blog.kaggle.com/\n",
      "None : https://www.kaggle.com/inclass\n",
      "\r\n",
      "                Create a profile\r\n",
      "             : /?login=true\n",
      "kernels : /kernels\n",
      "user ranking : /users\n",
      "forums : /forums\n",
      "Follow our jobs board : /jobs\n",
      "None : /competitions\n",
      "None : /datasets\n",
      "None : /kernels\n",
      "\r\n",
      "            Create an account\r\n",
      "         : /?login=true\n",
      "\r\n",
      "            Host a competition\r\n",
      "         : /host\n",
      "Our Team : /team\n",
      "Terms : /terms\n",
      "Privacy : /privacy\n",
      "Contact/Support : /contact\n"
     ]
    }
   ],
   "source": [
    "# soup('a') is same with soup.find_all('a')\n",
    "for link in soup('a'):\n",
    "    print(link.string, \":\", link.get('href'))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
